{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# COCO data preprocessing\n",
    "\n",
    "This code will download the caption anotations for coco and preprocess them into an hdf5 file and a json file. \n",
    "\n",
    "These will then be read by the COCO data loader in Lua and trained on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# lets download the annotations from http://mscoco.org/dataset/#download\n",
    "import os\n",
    "os.system('wget http://msvocds.blob.core.windows.net/annotations-1-0-3/captions_train-val2014.zip') # ~19MB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "os.system('unzip captions_train-val2014.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "val = json.load(open('annotations/captions_val2014.json', 'r'))\n",
    "train = json.load(open('annotations/captions_train2014.json', 'r'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[u'info', u'images', u'licenses', u'annotations']\n",
      "{u'description': u'This is stable 1.0 version of the 2014 MS COCO dataset.', u'url': u'http://mscoco.org', u'version': u'1.0', u'year': 2014, u'contributor': u'Microsoft COCO group', u'date_created': u'2015-01-27 09:11:52.357475'}\n",
      "40504\n",
      "202654\n",
      "{u'license': 3, u'file_name': u'COCO_val2014_000000391895.jpg', u'coco_url': u'http://mscoco.org/images/391895', u'height': 360, u'width': 640, u'date_captured': u'2013-11-14 11:18:45', u'flickr_url': u'http://farm9.staticflickr.com/8186/8119368305_4e622c8349_z.jpg', u'id': 391895}\n",
      "{u'image_id': 203564, u'id': 37, u'caption': u'A bicycle replica with a clock as the front wheel.'}\n"
     ]
    }
   ],
   "source": [
    "print val.keys()\n",
    "print val['info']\n",
    "print len(val['images'])\n",
    "print len(val['annotations'])\n",
    "print val['images'][0]\n",
    "print val['annotations'][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "# combine all images and annotations together\n",
    "imgs = val['images'] + train['images']\n",
    "annots = val['annotations'] + train['annotations']\n",
    "\n",
    "# for efficiency lets group annotations by image\n",
    "itoa = {}\n",
    "for a in annots:\n",
    "    imgid = a['image_id']\n",
    "    if not imgid in itoa: itoa[imgid] = []\n",
    "    itoa[imgid].append(a)\n",
    "\n",
    "# create the json blob\n",
    "out = []\n",
    "for i,img in enumerate(imgs):\n",
    "    imgid = img['id']\n",
    "    \n",
    "    # coco specific here, they store train/val images separately\n",
    "    loc = 'train2014' if 'train' in img['file_name'] else 'val2014'\n",
    "    \n",
    "    jimg = {}\n",
    "    jimg['file_path'] = os.path.join(loc, img['file_name'])\n",
    "    jimg['id'] = imgid\n",
    "    \n",
    "    sents = []\n",
    "    annotsi = itoa[imgid]\n",
    "    for a in annotsi:\n",
    "        sents.append(a['caption'])\n",
    "    jimg['captions'] = sents\n",
    "    out.append(jimg)\n",
    "    \n",
    "json.dump(out, open('coco_raw.json', 'w'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}\n"
     ]
    }
   ],
   "source": [
    "# lets see what they look like\n",
    "print out[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
